1 Setup

1.1 Load libraries

1.2 All parameters

1.2.1 File paths

##### If you have M3 google drive and bitbucket repo on your local machine,
##### you only need to change the two parameters below.
# M3-shared google drive location (original shared-drive path, kept for reference)
#M3folder_loc_for_ps='/Google Drive/M3-shared/V4/Data/200312_ASVdata8_updateAsteria/ps_notnorm_age_filter_complete_family.rds'

# Load the non-normalized, age-filtered phyloseq object (complete family-level data).
# NOTE: user-specific local path -- change this to match your machine.
ps_not_norm_comp <- readRDS("~/M3_Datasets/ps_not_norm_age_filter_complete_family.rds")

## Output data location (subject to change).
#output_data=paste0(M3folder_loc, 'Data/V4/180808_ASVdata4/OutputData_Agefiltered/')
# All result files (csv / Rda) below are written under this directory.
output_data <- "~/M3_Datasets/"

1.2.2 Cutoff parameters

# ---- Cutoff parameters ----
# Minimum post-DADA2 read count required for a sample to enter the analysis
min_postDD <- 20000
# Adjusted-p cutoff for calling DESeq2 results significant
deseq_cut <- 0.05
# Adjusted-p cutoff for calling metagenomeSeq results significant
mtgseq_cut <- 0.05
# Chi-squared test cutoff (significance of diet questionnaire results)
chisq_cut <- 0.05
# PERMANOVA p-value and R2 cutoffs used for visualization
permanova_pcut <- 0.05
permanova_cut <- 0.1

2 Summarize metadata

# Chi-squared test of one categorical metadata column against phenotype (A vs N).
# ps:      phyloseq object whose sample_data() holds the metadata
# metacol: name (string) of the metadata column to test
# Returns a data.table with columns: testvar, category, A, N, chisq_p.
run_chisq_test <- function(ps, metacol){
  meta_df <- data.frame(sample_data(ps))
  vals <- meta_df[, metacol]
  # Drop missing entries in all three observed forms: genuine NA, the literal
  # text "NA" (some NAs were recorded as text!), and empty strings.
  keep <- !is.na(vals) & vals != 'NA' & vals != ''
  sub_meta <- meta_df[keep, ]
  # Chi-squared test of the selected column vs phenotype
  test_out <- chisq.test(table(sub_meta[, metacol], sub_meta[, 'phenotype']))
  # Observed counts reshaped wide: one row per category, one column per phenotype
  out <- data.table(test_out$observed)
  out <- data.table(dcast(out, V1 ~ V2, value.var='N'))
  out[, `:=`(testvar=metacol, chisq_p=test_out$p.value)]
  out[, list(testvar, category=V1, A, N, chisq_p)]
}

# Stacked-bar plot of category counts per phenotype for one tested variable.
# chisq_resDT: result table with columns testvar, category, A, N, chisq_p
# var_name:    value of testvar to plot
plot_composition <- function(chisq_resDT, var_name){
  long_dt <- melt(chisq_resDT, id.vars=c('testvar', 'category', 'chisq_p'))
  fig <- ggplot(data=long_dt[testvar==var_name],
                aes(x=variable, y=value, fill=category)) +
    geom_bar(stat="identity") +
    xlab('') +
    ylab('Number of sample') +
    ggtitle(var_name) +
    theme_minimal() +
    theme(legend.title=element_blank(),
          legend.position="bottom",
          axis.text.x=element_text(vjust=1, hjust=1)) +
    scale_fill_manual(values=sgColorPalette)
  print(fig)
}

2.1 Demographics

2.1.1 Biological.sex

# Run chi-squared tests for all categorical data in the dataframe.
# First fix the mapping file for stats by separating categorical vs
# non-categorical columns: coerce every column to numeric; columns that become
# all-NA are non-numeric, so they are restored from the original and converted
# to factors.
metadata_ok<-sample_data(ps_not_norm_comp)
# NOTE(review): as.numeric() on non-numeric columns emits coercion warnings;
# the all-NA result below is precisely what identifies categorical columns.
# The `tmp<-` assignment is redundant (its value is returned anyway).
fix_metada<-apply(metadata_ok, 2, function(x) {tmp<-as.numeric(x)})
rownames(fix_metada)<-rownames(metadata_ok)
fix_metada<-as.data.frame(fix_metada)
for (i in 1:ncol(fix_metada)){
  # Column coerced to all-NA -> non-numeric: restore original values as a factor
  if (all(is.na(fix_metada[,i]))) 
    {fix_metada[,i] <-metadata_ok[,i]  
    fix_metada[,i]<-as.factor(fix_metada[,i])} 
}
metadata_ok<-fix_metada
# Push the cleaned metadata back into the phyloseq object
sample_data(ps_not_norm_comp) <- metadata_ok

# Only run the categorical values through chi-square; drop non-metadata columns.
num_cat<-names(Filter(is.numeric, metadata_ok))
fac_cat<-names(Filter(is.factor, metadata_ok))
# Remove the first 13 columns (identifiers, not metadata) and the last one, phenotype
fac_cat<-fac_cat[-c(1:13, length(fac_cat))]
# Finally remove variables asked only of children with ASD, or with one factor level & NA
fac_cat<-fac_cat[-which(fac_cat %in% c("Behavior.video.submitted..M3.","Language.ability.and.use","Conversation.ability","Understands.speech","Plays.imaginatively.when.alone","Plays.imaginatively.with.others","Plays.in.a.group.with.others","Eye.contact.finding","Childhood.behavioral.development.finding","Repetitive.motion","Picks.up.objects.to.show.to.others","Sleep.pattern.finding","Response.to.typical.sounds","Self.injurious.behavior.finding","Gastrointestinal.problems..M3.", "Imitation.behavior", "Other.stool.sample.collection.method.explained..M3.", "Flu.shot.in.the.last..MFlu.shot.in.the.last..M3.", "Pica.disease", "Additional.info.affecting.microbiome..M3.", "Dietary.restrictions.details..M3."))]

# Also skip variables with only one observed factor level (no chi-square possible).
# Run the chi-square test on every remaining categorical variable.
chisquare_p.val=c()
names_chisquare_p.val=c()
all_chisquare=list()
for (i in 1:length(fac_cat)){
  # Only test variables with at least two factor levels
  if (length(levels(metadata_ok[,fac_cat[i]])) > 1) 
    {cat(i," ")  # progress indicator: index of each variable actually tested
    tmp<-run_chisq_test(ps_not_norm_comp, fac_cat[i])
    chisquare_p.val<-c(chisquare_p.val,min(tmp$chisq_p))
    names_chisquare_p.val<-c(names_chisquare_p.val,fac_cat[i])
    # NOTE(review): indexing by loop position leaves NULL slots for skipped
    # variables; names(all_chisquare)<-fac_cat below relies on the list
    # reaching length(fac_cat) -- verify the last variable is never skipped.
    all_chisquare[[i]]<-tmp}
}
## 3  4  5  6  7  8  9  10  11  12  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  54  55  56  57  60  61  62  63  64  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  84  86  87  88  94  99  120  245  246  247  298
names(chisquare_p.val)<-names_chisquare_p.val
names(all_chisquare) <-fac_cat

# Multiple-testing correction (p.adjust default is Holm), then keep significant ones
chisquare_p.val<-p.adjust(chisquare_p.val)
chisquare_p.val<-chisquare_p.val[chisquare_p.val < 0.05]

# These two are equivalent to the phenotype itself and must be removed
chisquare_p.val <-chisquare_p.val[-which(names(chisquare_p.val) %in% c("Host.disease.status", "phenotype"))]

length(chisquare_p.val) #31
## [1] 31
chisquare_p.val
##                                 Functional.bowel.finding 
##                                             1.225404e-09 
##                                    Specific.food.allergy 
##                                             1.990454e-02 
##                                           Dietary.regime 
##                                             1.139866e-02 
##                         GI.symptoms.within.3.months..M3. 
##                                             1.143152e-16 
##                                           Biological.sex 
##                                             7.641310e-11 
##                                 GI.issues.this.week..M3. 
##                                             1.828008e-11 
##                                           Gluten.allergy 
##                                             1.356088e-02 
##                            Non.celiac.gluten.sensitivity 
##                                             9.847156e-06 
##                      Whole.grain..consumption.frequency. 
##                                             6.734235e-03 
##                            Dairy..consumption.frequency. 
##                                             1.366800e-06 
##                            Fruit..consumption.frequency. 
##                                             3.263001e-09 
##           Meals.prepared.at.home..consumption.frequency. 
##                                             4.822768e-03 
##               Ready.to.eat.meals..consumption.frequency. 
##                                             1.725369e-02 
##                        Vegetable..consumption.frequency. 
##                                             1.623494e-07 
##                                      Lactose.intolerance 
##                                             1.045148e-04 
##                        Probiotic..consumption.frequency. 
##                                             4.294707e-08 
##                                Dietary.restrictions..M3. 
##                                             1.097744e-09 
##                                       Dietary.supplement 
##                                             1.731861e-11 
##     Vitamin.B.complex.supplement..consumption.frequency. 
##                                             2.019450e-09 
##                        Vitamin.D..consumption.frequency. 
##                                             6.852504e-10 
##                                      LR6.prediction..M3. 
##                                             1.731389e-16 
##                                     LR10.prediction..M3. 
##                                             4.762427e-16 
##                                      LR5.prediction..M3. 
##                                             3.342112e-18 
##                                           Toilet.trained 
##                                             4.070706e-03 
##                            Other.symptoms.this.week..M3. 
##                                             4.837634e-02 
##                      Recent.anxiety..caretaker.reported. 
##                                             2.582502e-03 
## Meats.and.seafood..consumption.frequency...longitudinal. 
##                                             4.027703e-02 
##         Vegetable..consumption.frequency...longitudinal. 
##                                             1.635774e-05 
##             Fruit..consumption.frequency...longitudinal. 
##                                             3.502294e-05 
##                                        Toilet.cover..M3. 
##                                             1.205660e-03 
##                     Most.recent.GI.episode.symptoms..M3. 
##                                             9.103737e-05
# Visualization of the results.
# Keep only the variables that remained significant after correction.
all_chisquare<-all_chisquare[names(all_chisquare) %in% names(chisquare_p.val)]
# Save the adjusted p-values to csv
write.csv(format(chisquare_p.val, digits=2), file=paste0(output_data,"Xsqr_05.csv"), quote=F)
# Plot one example out of 29.
# BUG FIX: use [[1]] so the data.table itself is passed; [1] returns a
# one-element list, which is not what plot_composition() expects.
plot_composition(all_chisquare[[1]], names(all_chisquare)[1])

2.1.2 Racial.group

# Cross-tabulate racial group by biological sex (printed count table)
table(sample_data(ps_not_norm_comp)$Racial.group, sample_data(ps_not_norm_comp)$Biological.sex)
##                                         
##                                          Female Male
##   Asian race                                 15   33
##   Asian race & Middle Eastern race            3    3
##   Asian race & Unknown racial group           3    3
##   Caucasian                                  99  213
##   Caucasian & Asian race                      3    3
##   Caucasian & Hispanic                        0   12
##   Caucasian & Indian (subcontinent) race      0    6
##   Hispanic                                    6   18
##   Hispanic & African race                     6    0
##   Indian (subcontinent) race                  3    3
##   Indian race                                 6    6
##   Unknown racial group                        0   18
# Run chi-squared test of racial group against phenotype
race=run_chisq_test(ps_not_norm_comp, 'Racial.group')
# Print the result table
pander(race)
testvar category A N chisq_p
Racial.group Asian race 24 24 1
Racial.group Asian race & Middle Eastern race 3 3 1
Racial.group Asian race & Unknown racial group 3 3 1
Racial.group Caucasian 156 156 1
Racial.group Caucasian & Asian race 3 3 1
Racial.group Caucasian & Hispanic 6 6 1
Racial.group Caucasian & Indian (subcontinent) race 3 3 1
Racial.group Hispanic 12 12 1
Racial.group Hispanic & African race 3 3 1
Racial.group Indian (subcontinent) race 3 3 1
Racial.group Indian race 6 6 1
Racial.group Unknown racial group 9 9 1
# Stacked-bar plot of racial group composition per phenotype
plot_composition(race, 'Racial.group')

# Percentage table: column-wise proportions (per phenotype), in percent
race_prop=prop.table(as.matrix(race[, .(A, N)]), margin=2)*100
row.names(race_prop) <- race$category
pander(race_prop)
  A N
Asian race 10.39 10.39
Asian race & Middle Eastern race 1.299 1.299
Asian race & Unknown racial group 1.299 1.299
Caucasian 67.53 67.53
Caucasian & Asian race 1.299 1.299
Caucasian & Hispanic 2.597 2.597
Caucasian & Indian (subcontinent) race 1.299 1.299
Hispanic 5.195 5.195
Hispanic & African race 1.299 1.299
Indian (subcontinent) race 1.299 1.299
Indian race 2.597 2.597
Unknown racial group 3.896 3.896
# Write the racial-group percentage table to csv
write.csv(race_prop, file=paste0(output_data, 'Race.csv'))

2.1.3 Age..months.

# Make sure age is numeric before plotting/testing
sample_data(ps_not_norm_comp)$Age..months. <- as.numeric(sample_data(ps_not_norm_comp)$Age..months.)

# Boxplot + jittered points of age (months) by phenotype
ggplot(data=sample_data(ps_not_norm_comp), aes(x=phenotype, y=Age..months., fill=phenotype))+
  geom_boxplot(width=0.7, outlier.colour='white')+
  geom_jitter(size=1, position=position_jitter(width=0.1))+
  xlab('')+ylab('Age (months)')+
  scale_fill_manual(values=sgColorPalette)+
  theme_minimal()

# Run tests to check significance
shapiro.test(sample_data(ps_not_norm_comp)$Age..months.) # not normal -> we need a rank-based test
## 
##  Shapiro-Wilk normality test
## 
## data:  sample_data(ps_not_norm_comp)$Age..months.
## W = 0.974, p-value = 2.535e-07
wilcox.test(Age..months. ~ phenotype, data=data.frame(sample_data(ps_not_norm_comp)), var.equal=FALSE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Age..months. by phenotype
## W = 31779, p-value = 0.0003799
## alternative hypothesis: true location shift is not equal to 0
# Generalize: Wilcoxon test for every numeric variable, after dropping ID columns
num_cat<-num_cat[-which(num_cat %in% c("Host.ID","Family.group.ID","Biospecimen.ID", "Mobile.Autism.Risk.Assessment.Score"))]
wilcox_pval=c()
# Only doing 1:22 since NAs in the last 5 variables cause errors in wilcox.test.
# NOTE(review): the hard-coded 22 must be revisited if num_cat changes upstream.
for (i in 1:22){
  #if (levels(get(num_cat[i], metadata_ok)) >= 2)
  # get(num_cat[i]) is resolved inside metadata_ok by the formula interface
  tmp<-wilcox.test(get(num_cat[i]) ~ phenotype, data=metadata_ok, var.equal=FALSE)
  wilcox_pval<-c(wilcox_pval,tmp$p.value)
}
names(wilcox_pval)<-num_cat[1:22]
# Multiple-testing correction (p.adjust default is Holm)
wilcox_pval<-p.adjust(wilcox_pval)
wilcox_pval[wilcox_pval<0.05] # significant: age and LR probabilities
##                          <NA>                          <NA> 
##                            NA                            NA 
##                          <NA>                  Age..months. 
##                            NA                  4.938530e-03 
##  LR6.probability.not.ASD..M3.      LR6.probability.ASD..M3. 
##                  1.459578e-21                  1.459578e-21 
## LR10.probability.not.ASD..M3.     LR10.probability.ASD..M3. 
##                  1.555606e-23                  1.555606e-23 
##  LR5.probability.not.ASD..M3.      LR5.probability.ASD..M3. 
##                  3.320889e-20                  3.320889e-20 
##                   Age..years. 
##                  4.938530e-03

2.2 Dietary variance

Dietary variance amongst ASD patients will also be assessed. Based on preliminary analyses, we expect that ASD participants, collectively, will exhibit a minimal degree of dietary variance.

# Read the metadata category file
#meta_cat=fread(paste0(M3folder_loc, meta_cat_fp), header=TRUE)

# This csv was created from the file in the shared M3 google drive: first sheet,
# last incomplete column removed, then converted to csv.
meta_cat<-read.csv("updated_metacategories.csv")
colnames(meta_cat)[1] <- "varname"

# Subset the metadata to the diet-related questions flagged in meta_cat
diet_info<-metadata_ok[,colnames(metadata_ok) %in% meta_cat$varname[which(meta_cat$diet==TRUE)]]
# Additional errors to remove: columns filled with only NA or a single factor
# level (chi-square needs at least two levels)
diet_col_levels<-sapply(diet_info, levels)
dietcol_levelscount<-sapply(diet_col_levels, length)

# There are only two numeric dietary variables and they are filled primarily
# with NAs, so we omit them here.
#diet_info[,which(sapply(diet_info, class) == "numeric")]
diet_info <- diet_info[,which(dietcol_levelscount >= 2)]

#dietq_col <-which(colnames(sample_data(ps_not_norm_comp)) %in% colnames(diet_info))
dietq_col <- colnames(diet_info)

# For each diet variable, summarize and test A vs N with chi-square
# (hypothesis: dietary variance in ASD matches that of NT)
master_resDT=NULL
for(i in dietq_col){
  resDT=run_chisq_test(ps_not_norm_comp, i)
# Append this variable's results to the master table
  master_resDT <- rbindlist(list(master_resDT, resDT))
}

# Variables tested
unique(master_resDT$testvar)
##  [1] "Dietary.regime"                                            
##  [2] "Whole.grain..consumption.frequency."                       
##  [3] "Fermented.vegetable..consumption.frequency."               
##  [4] "Dairy..consumption.frequency."                             
##  [5] "Fruit..consumption.frequency."                             
##  [6] "Meals.prepared.at.home..consumption.frequency."            
##  [7] "Ready.to.eat.meals..consumption.frequency."                
##  [8] "Meat..consumption.frequency."                              
##  [9] "Olive.oil.used.in.cooking..M3."                            
## [10] "Seafood..consumption.frequency."                           
## [11] "Sweetened.drink..consumption.frequency."                   
## [12] "Vegetable..consumption.frequency."                         
## [13] "Restaurant.prepared.meals..consumption.frequency."         
## [14] "Sugary.food..consumption.frequency."                       
## [15] "Multivitamin"                                              
## [16] "Probiotic..consumption.frequency."                         
## [17] "Dietary.supplement"                                        
## [18] "Vitamin.B.complex.supplement..consumption.frequency."      
## [19] "Vitamin.D..consumption.frequency."                         
## [20] "Starchy.food..consumption.frequency...longitudinal."       
## [21] "Meats.and.seafood..consumption.frequency...longitudinal."  
## [22] "Dietary.fat.and.oil..consumption.frequency...longitudinal."
## [23] "Vegetable..consumption.frequency...longitudinal."          
## [24] "Fruit..consumption.frequency...longitudinal."              
## [25] "Red.meat..consumption.frequency."
# Order by significance (most significant first)
master_resDT <- master_resDT[order(chisq_p)]

# Interactive results table
datatable(master_resDT)
# Write per-category counts and per-variable p-values to csv
write.csv(master_resDT, file=paste0(output_data, 'Dietary_var_all_proportion.csv'))
write.csv(unique(master_resDT[, list(testvar, chisq_p)]), file=paste0(output_data, 'Dietary_var_chisq_test.csv'))

# Plot the 3 most significant vars (master_resDT is already sorted by chisq_p)
plot_diet=master_resDT[testvar %in% unique(master_resDT[, list(testvar, chisq_p)])$testvar[1:3]]
for(i in unique(plot_diet$testvar)){
  plot_composition(plot_diet, i)
}

3 Normalization

# Create the output subdirectory for normalized phyloseq objects
dir.create(paste0(output_data, 'Normalized/'))

# Prevalence filtering.
# Keep only taxa detected in more than the given fraction of samples.
# (The function name keeps the original spelling "Prevolence" because it is
# called further down in this file.)
# ps: phyloseq object; percentSamplesPresentIn: fraction in [0, 1]
filterTaxaByPrevolence <- function(ps, percentSamplesPresentIn){
  min_samples <- percentSamplesPresentIn * nsamples(ps)
  prevalent_enough <- function(taxa) sum(taxa > 0) > min_samples
  keep <- apply(data.frame(otu_table(ps)), 1, prevalent_enough)
  prune_taxa(keep, ps)
}

# Cumulative-sum-scaling (CSS) normalization via metagenomeSeq.
# All plotting downstream is done with the CSS-normalized object.
# ps: phyloseq object; returns a new phyloseq object with CSS-normalized counts.
CSS_norm <- function(ps){
  mgs <- phyloseq_to_metagenomeSeq(ps)
  percentile <- cumNormStatFast(mgs)
  mgs <- cumNorm(mgs, p = percentile)
  counts_norm <- MRcounts(mgs, norm = TRUE)
  # Rebuild a phyloseq object around the normalized count matrix
  phyloseq(otu_table(counts_norm, taxa_are_rows = TRUE), sample_data(ps), tax_table(ps))
}

# DESeq2 variance-stabilizing normalization.
# ps: phyloseq object (must carry a `phenotype` sample variable for the design).
# Returns a phyloseq object whose OTU table holds VST counts shifted to be >= 0.
deSeqNorm <- function(ps){
  dds <- phyloseq_to_deseq2(ps, ~ phenotype)
  dds <- estimateSizeFactors(dds, type = "poscounts")
  dds <- estimateDispersions(dds)
  vst_counts <- getVarianceStabilizedData(dds)
  # VST can produce negative values; shift so the minimum is zero
  vst_counts <- vst_counts + abs(min(vst_counts))
  phyloseq(otu_table(vst_counts, taxa_are_rows = TRUE), sample_data(ps), tax_table(ps))
}

# Remove taxa present in fewer than 3% of samples (basic prevalence filtering)
filtered_ps003<-filterTaxaByPrevolence(ps_not_norm_comp, 0.03)
filtered_ps003
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 886 taxa and 462 samples ]
## sample_data() Sample Data:       [ 462 samples by 364 sample variables ]
## tax_table()   Taxonomy Table:    [ 886 taxa by 8 taxonomic ranks ]
saveRDS(filtered_ps003, file=paste0(output_data, "Normalized/ps_not_norm_comp_pass_min_postDD_min0.03.Rda"))

# CSS normalization
ps_CSS_norm_pass_min_postDD_sup003<-CSS_norm(filtered_ps003)
saveRDS(ps_CSS_norm_pass_min_postDD_sup003, file=paste0(output_data, "Normalized/ps_CSS_pass_min_postDD_min0.03.Rda"))

# DESeq (variance-stabilizing) normalization
ps_DeSeq_norm_pass_min_postDD_sup003<-deSeqNorm(filtered_ps003)
saveRDS(ps_DeSeq_norm_pass_min_postDD_sup003, file=paste0(output_data, "Normalized/ps_DeSeq_pass_min_postDD_min0.03.Rda"))

# TSS normalization: per-sample (column-wise) proportions.
# NOTE(review): unlike the CSS/DESeq objects above, the TSS object is not
# saved to disk -- confirm whether that is intentional.
propDF=prop.table(as.matrix(otu_table(filtered_ps003)), margin=2)
ps_TSS_norm_pass_min_postDD_sup003 <- phyloseq(otu_table(propDF, taxa_are_rows=TRUE), 
                                               tax_table(filtered_ps003), 
                                               sample_data(filtered_ps003))

3.0.1 Repeated measures ANOVA to find ASVs that have means that change significantly between timepoints

# Build an ASV-by-sample table (samples as rows after the transpose) augmented
# with timepoint + host id for the repeated-measures ANOVA below.
asv_table<-t(otu_table(ps_DeSeq_norm_pass_min_postDD_sup003))
asv_table <- as.data.frame(asv_table)
asv_table$timepoint <- ps_DeSeq_norm_pass_min_postDD_sup003@sam_data$Within.study.sampling.date
asv_table$timepoint <- as.factor(asv_table$timepoint)
asv_table$Host.Name <- ps_DeSeq_norm_pass_min_postDD_sup003@sam_data$Host.Name

# Seed the results table with the first ASV (column 1) to build off of
tmp <-summary(aov(asv_table[,1] ~ timepoint + Error(Host.Name/timepoint), data = asv_table))  
p_val <-tmp$`Error: Host.Name:timepoint`[[1]]$`Pr(>F)`[1]
anova_asv_res <- tibble(colnames(asv_table[1]), p_val)
colnames(anova_asv_res) <- c("ASV", "p_val")

# Run the same model for every remaining ASV; the last 2 columns are
# timepoint and Host.Name, hence the -2 upper bound.
# NOTE(review): rbind in a loop grows the tibble row by row; a list of rows
# followed by one bind would be faster for many ASVs.
for (i in 2:(length(colnames(asv_table))-2)) {
  form <-as.formula(paste0("asv_table[," , i, "]", " ~ timepoint + Error(Host.Name/timepoint)"))
  tmp <-summary(aov(formula = form, data = asv_table))  
  p_val <-tmp$`Error: Host.Name:timepoint`[[1]]$`Pr(>F)`[1]
  tmpres <- tibble(colnames(asv_table[i]), p_val)
  colnames(tmpres) <- c("ASV", "p_val")
  anova_asv_res<- rbind(anova_asv_res, tmpres)
}

# Find ASVs whose mean changes significantly between timepoints (p <= 0.05, uncorrected)
asv_sig_btwn_timep<-anova_asv_res$ASV[which(anova_asv_res$p_val <= 0.05)]

asv_sig_btwn_timep

asv_sig_btwn_timep
##  [1] "GCAAGCGTTATCCGGAATTACTGGGTGTAAAGGGTGCGTAGGTGGTATGGCAAGTCAGAAGTGAAAACCCAGAGCTTAACTCTGGGACTGCTTTTGAAACTGTCAGACTAGAGTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACATCAGTGGCGAAGGCGGCTTACTGGACTGAAACTGACACTGAGGCACGAAAGCGTGGGG"
##  [2] "GCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGTACGTAGGTGGTTACCTAAGCACAGGGTATAAGGCAATAGCTTAACTATTGTTCGCCTTGTGAACTGGGCTACTTGAGTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGTAACTGACACTGAGGTACGAAAGCGTGGGG" 
##  [3] "GCAAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATATTTAAGTGGGATGTGAAATACCTGAGCTTAACTTGGGAGCTGCATTCCAAACTGGATATCTAGAGTGCAGGAGAGGAGAATGGAATTCCTAGTGTAGCGGTGAAATGCGTAGAGATTAGGAAGAACACCAGTGGCGAAGGCGATTCTCTGGACTGTAACTGACGCTGAGGCTCGAAAGCGTGGGG"
##  [4] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGGACTGCAAGTTGGATGTGAAATACCGTGGCTTAACCACGGAACTGCATCCAAAACTGTAGTTCTTGAGTGAAGTAGAGGCAAGCGGAATTCCGAGTGTAGCGGTGAAATGCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGGCTTTAACTGACGCTGAGGCTCGAAAGTGTGGGG"
##  [5] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAGCAGCAAGTCTGATGTGAAAACCCGGGGCTCAACTCCGGGACTGCATTGGAAACTGTTGATCTGGAGTGCCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
##  [6] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTGTGACAAGTCTGATGTGAAAGGCATGGGCTCAACCTGTGGACTGCATTGGAAACTGTCATACTTGAGTGCCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
##  [7] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGTATGGCAAGTCTGATGTGAAAGGCCGGGGCTCAACCCCGGGACTGCATTGGAAACTGCCAGACTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGACAACTGACGCTGAGGCTCGAAAGCGTGGGG"
##  [8] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGCCATGCAAGTCAGAAGTGAAAATCCGGGGCTCAACCCCGGAACTGCTTTTGAAACTGTGAGGCTAGAGTGCAGGAGGGGTGAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTGGGG"
##  [9] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGTATGGCAAGTCAGAAGTGAAAGGCTGGGGCTCAACCCCGGGACTGCTTTTGAAACTGTCAAACTAGAGTACAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGAAACTGACACTGAGGCACGAAAGCGTGGGG"
## [10] "GCAAGCGTTATCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCTCGGCTTAACCGAGGAAGTGCATCGGAAACTGGGAAACTTGAGTACAGAAGAGGACAGTGGAACTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAAGAACACCAGTGGCGAAGGCGGCTGTCTGGTCTGTAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [11] "GCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGAAGGTAAGTTAGTTGTGAAATCCCTCGGCTCAACTGAGGAACTGCGACTAAAACTGCTTTTCTTGAGTGCTGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGACAGCAACTGACGTTGAGGCACGAAAGTGTGGGG"
## [12] "GCAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATTCCAAGTCAGCGGTGAAATCTCCATGCTCAACATGGACACTGCCGTTGAAACTGGCGTTCTAGAGTGTAAATGAGGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCCTGCTGGGATACAACTGACGCTGAGGCACGAAAGCGTGGGT"
## [13] "GCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCTCGGTAAGTCTGGAGTGAAAGTCCTGCTTTTAAGGTGGGAATTGCTTTGGATACTGTCGGGCTTGAGTGCAGGAGAGGTTAGTGGAATTCCCAGTGTAGCGGTGAAATGCGTAGAGATTGGGAGGAACACCAGTGGCGAAGGCGACTAACTGGACTGTAACTGACGCTGAGGCGCGAAAGTGTGGGG"
## [14] "GCAAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGTGGCCATGTAAGTCAGGTGTGAAAGACCGGGGCTCAACCCCGGGGTTGCACTTGAAACTGTGTGGCTTGAGTACAGGAGAGGGAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGACTGTAACTGACACTGAGGCACGAAAGCGTGGGG"
## [15] "GCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAGAGCAAGTTGGAAGTGAAATCTGTGGGCTCAACTCACAAATTGCTTTCAAAACTGTTTTTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACTAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [16] "GCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGATCGTAAGTTGGGAGTGAAATTCATGGGCTCAACCCATGACCTGCTTTCAAAACTGCGATTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACTAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [17] "GCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGGAGGCAAGTTGAATGTCTAAACTATCGGCTCAACTGATAGTCGCGTTCAAAACTGCCACTCTTGAGTGCAGTAGAGGTAGGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCTGTAACTGACGCTGAGGCTCGAAAGCGTGGGT" 
## [18] "GCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCGCGCAGGCGGATAGGTCAGTCTGTCTTAAAAGTTCGGGGCTTAACCCCGTGATGGGATGGAAACTGCCAATCTAGAGTATCGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGACTTTCTGGACGAAAACTGACGCTGAGGCGCGAAAGCCAGGGG" 
## [19] "GCAAGCGTTGTCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATTTTTAAGTGGGATGTGAAATACCCGGGCTCAACCTGGGTGCTGCATTCCAAACTGGAAATCTAGAGTGCAGGAGGGGAAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGAGATTAGGAAGAACACCAGTGGCGAAGGCGACTTTCTGGACTGTAACTGACGCTGAGGCTCGAAAGCGTGGGG"
## [20] "GCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGAGAGACAAGTCAGATGTGAAATCCGCGGGCTCAACCCGCGAACTGCATTTGAAACTGTTTCCCTTGAGTATCGGAGAGGTAACCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGGTTACTGGACGACAACTGACGGTGAGGCGCGAAAGCGTGGGG"
## [21] "GCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGAAGAATAAGTCTGATGTGAAAGCCCTCGGCTTAACCGAGGAACTGCATCGGAAACTGTTTTTCTTGAGTGCAGAAGAGGAGAGTGGAACTCCATGTGTAGCGGTGGAATGCGTAGATATATGGAAGAACACCAGTGGCGAAGGCGGCTCTCTGGTCTGCAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [22] "GCGAGCGTTATCCGGAATCATTGGGCGTAAAGAGGGAGCAGGCGGCAATAGAGGTCTGCGGTGAAAGCCTGAAGCTAAACTTCAGTAAGCCGTGGAAACCAAATAGCTAGAGTGCAGTAGAGGATCGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGACGATCTGGGCTGCAACTGACGCTCAGTCCCGAAAGCGTGGGG" 
## [23] "GCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGCGCGCAGGTGGTTGATTAAGTCTGATGTGAAAGCCCACGGCTTAACCGTGGAGGGTCATTGGAAACTGGTCAACTTGAGTGCAGAAGAGGGAAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGAGATATGGAGGAACACCAGTGGCGAAGGCGGCTTCCTGGTCTGTAACTGACACTGAGGCGCGAAAGCGTGGGG"
## [24] "GCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGTACGTAGGCGGTTTTTTAAGCGAGGGGTATAAGGCAGCGGCTTAACTGCTGTTGGCCCCTCGAACTGGAGGACTTGAGTGTCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGAGATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACGACAACTGACGCTGAGGTACGAAAGCGTGGGG" 
## [25] "GCGAGCGTTATCCGGAATTATTGGGTGTAAAGGGTGCGTAGGCGGGATGTAAAGTCAGATGTGAAATGCCGCGGCTCAACCGCGGAGCTGCATTTGAAACTTATGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCTTACTGGGCTTAGACTGACGCTGAGGCACGAAAGTGTGGGG"
## [26] "GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGGCTTTTAAGTCAGCGGTCAAATGTCACGGCTCAACCGTGGCCAGCCGTTGAAACTGCAAGCCTTGAGTCTGCACAGGGCACATGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATCGCGAAGGCATTGTGCCGGGGCATAACTGACGCTGAGGCTCGAAAGTGCGGGT" 
## [27] "GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGCGCGTAGGCGGGACGTCAAGTCAGCGGTAAAAGACTGCAGCTAAACTGTAGCACGCCGTTGAAACTGGCGCCCTCGAGACGAGACGAGGGAGGCGGAACAAGTGAAGTAGCGGTGAAATGCATAGATATCACTTGGAACCCCGATAGCGAAGGCAGCTTCCCAGGCTCGATCTGACGCTGATGCGCGAGAGCGTGGGT" 
## [28] "GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTTTATTAAGTTAGTGGTTAAATATTTGAGCTAAACTCAATTGTGCCATTAATACTGGTGAACTGGAGTACAGACGAGGTAGGCGGAATAAGTTAAGTAGCGGTGAAATGCATAGATATAACTTAGAACTCCGATAGCGAAGGCAGCTTACCAGACTGTAACTGACGCTGATGCACGAGAGCGTGGGT" 
## [29] "GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGTGGTGATTTAAGTCAGCGGTGAAAGTTTGTGGCTCAACCATAAAATTGCCGTTGAAACTGGGTTACTTGAGTGTGTTTGAGGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCTTACTAAACCATAACTGACACTGAAGCACGAAAGCGTGGGG"
## [30] "GCGAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGTTTAACAAGTCCAATGTGAAATACCCGGGCTTAACCTGGGGGGTGCATTGGAAACTGTTAGACTTGAGTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGCAACTGACGCTGAGGCACGAAAGCGTGGGG"
## [31] "GCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATAGCAAGTCAGATGTGAAAACTATGGGCTCAACCTGTAGATTGCATTTGAAACTGTTGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACATCGGTGGCGAAGGCGGCTTACTGGGCTTTTACTGACGCTGAGGCTCGAAAGCGTGGGG"
## [32] "GCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATGGCAAGTTGGATGTTTAAACTAACGGCTCAACTGTTAGGTGCATCCAAAACTGCTGTTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGGCTTTAACTGACGCTGAGGCTCGAAAGCGTGGGG" 
## [33] "t__217134"                                                                                                                                                                                                                                
## [34] "t__262755"                                                                                                                                                                                                                                
## [35] "t__264494"                                                                                                                                                                                                                                
## [36] "t__520"
# Remove the time-varying taxa from all phyloseq objects (they are essentially noise).
# NOTE(review): the phyloseq object is passed positionally while `taxa=` is named,
# so R's argument matching sends the object to prune_taxa's other parameter --
# confusing but functional; the conventional form is prune_taxa(keep_vector, ps).
filtered_ps003 <-prune_taxa(filtered_ps003, taxa = taxa_names(filtered_ps003)[!(taxa_names(filtered_ps003) %in% asv_sig_btwn_timep)])

ps_CSS_norm_pass_min_postDD_sup003<- prune_taxa(ps_CSS_norm_pass_min_postDD_sup003, taxa = taxa_names(ps_CSS_norm_pass_min_postDD_sup003)[!(taxa_names(ps_CSS_norm_pass_min_postDD_sup003) %in% asv_sig_btwn_timep)])

ps_DeSeq_norm_pass_min_postDD_sup003<- prune_taxa(ps_DeSeq_norm_pass_min_postDD_sup003, taxa = taxa_names(ps_DeSeq_norm_pass_min_postDD_sup003)[!(taxa_names(ps_DeSeq_norm_pass_min_postDD_sup003) %in% asv_sig_btwn_timep)])

4 DA taxa identification

Identify bacterial and archaeal taxa (genera, species and strains) whose abundance is observed significantly more or less in the ASD (A) group than in the control (N) group.

4.1 DESeq

# Create the output subdirectory for DESeq2 results
dir.create(paste0(output_data, 'DESeq/'))
### Run DESeq proper (not just the normalization, but the whole pipeline).
# ps:   phyloseq object (phenotype is the design variable)
# dcut: adjusted-p cutoff for significance
# Returns list(full results table, data.frame of significant taxa); when no
# taxon passes the cutoff, the second element is a 1x1 placeholder row "nothing".
runDESeq <- function(ps, dcut){
  dds <- phyloseq_to_deseq2(ps, ~ phenotype)
  dds <- estimateSizeFactors(dds, type = "poscounts")
  dds <- DESeq(dds, fitType="parametric", betaPrior = FALSE)
  # Contrast N vs A: log2 fold changes are N relative to A
  res <- results(dds, contrast = c("phenotype", "N", "A"))
  # Treat NA adjusted p-values (independent filtering/outliers) as non-significant
  res$padj[is.na(res$padj)] <- 1
  sig <- res[res$padj < dcut, ]
  if (nrow(sig) == 0) {
    sigtab <- as.data.frame(1, row.names="nothing")
    colnames(sigtab) <- 'padj'
  } else {
    sigtab <- data.frame(sig)
  }
  list(res, sigtab)
}


### Running the analysis.
### Split the data based on the 3 real timepoints.
P1<-prune_samples(rownames(sample_data(filtered_ps003))[sample_data(filtered_ps003)$Within.study.sampling.date == "Timepoint 1"], filtered_ps003)
P2<-prune_samples(rownames(sample_data(filtered_ps003))[sample_data(filtered_ps003)$Within.study.sampling.date == "Timepoint 2"], filtered_ps003)
P3<-prune_samples(rownames(sample_data(filtered_ps003))[sample_data(filtered_ps003)$Within.study.sampling.date == "Timepoint 3"], filtered_ps003)

# Several significant taxa per timepoint
deseq_res_P1 <- runDESeq(P1, deseq_cut) 
deseq_res_P2 <- runDESeq(P2, deseq_cut)
deseq_res_P3 <- runDESeq(P3, deseq_cut)

# Print significant taxa (interactive tables); element [[2]] is the significant subset
datatable(deseq_res_P1[[2]])
datatable(deseq_res_P2[[2]])
datatable(deseq_res_P3[[2]])
# "ASV_1669" is present twice: timepoint 1 and timepoint 3

# Save per-timepoint results
saveRDS(deseq_res_P1, file=paste0(output_data, "DESeq/deseq_res_P1_adjp", deseq_cut, ".Rda"))
saveRDS(deseq_res_P2, file=paste0(output_data, "DESeq/deseq_res_P2_adjp", deseq_cut, ".Rda"))
saveRDS(deseq_res_P3, file=paste0(output_data, "DESeq/deseq_res_P3_adjp", deseq_cut, ".Rda"))

#Working with time series
# Same as runDESeq() but with the sampling timepoint added to the design
# (~ phenotype + Within.study.sampling.date).
# NOTE(review): the DESeq2 vignette approach cited in the original comment uses
# a likelihood ratio test (test = "LRT" with a reduced formula dropping the
# time factor), but this fit uses the default Wald test — confirm which was
# intended. Also note the contrast here is (A, N), the reverse of runDESeq()'s
# (N, A), so log2 fold-change signs are flipped between the two functions.
runDESeq_time <- function(ps, dcut){
  dds <- phyloseq_to_deseq2(ps, ~ phenotype + Within.study.sampling.date)
  dds <- estimateSizeFactors(dds, type = "poscounts")
  dds <- DESeq(dds, fitType = "parametric", betaPrior = FALSE)
  #resultsNames(dds): to determine the contrast
  res <- results(dds, contrast = c("phenotype", "A", "N"))
  # taxa with NA adjusted p-values are treated as non-significant
  res$padj[is.na(res$padj)] <- 1
  sig <- res[res$padj < dcut, ]
  if (nrow(sig) == 0) {
    # placeholder table so downstream code always has something to read
    sigtab <- as.data.frame(1, row.names = "nothing")
    colnames(sigtab) <- 'padj'
  } else {
    sigtab <- data.frame(sig)
  }
  list(res, sigtab)
}

# fit the time-aware model on the full (all-timepoints) dataset
bla<-runDESeq_time(filtered_ps003, deseq_cut)
saveRDS(bla, file=paste0(output_data, "DESeq/Deseq_time_interaction_adjp", deseq_cut, ".Rda"))

# significant-taxa table; note bla[2][[1]] is equivalent to bla[[2]]
datatable(bla[2][[1]])
# significant ASVs (row names are ASV sequences or strain ids)
row.names(bla[2][[1]])
## [1] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCAGGGCAAGTCTGATGTGAAAACCCGGGGCTCAACCCCGGGACTGCATTGGAAACTGTCCGGCTGGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
## [2] "GCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCGCGCAGGCGGTTTCATAAGTCTGTCTTAAAAGTGCGGGGCTTAACCCCGTGAGGGGATGGAAACTATGGAACTGGAGTATCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGCTTTCTGGACGACAACTGACGCTGAGGCGCGAAAGCCAGGGG" 
## [3] "GCAAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCGCGCAGGCGGCTTCTTAAGTCTGTCTTAAAAGTGCGGGGCTTAACCCCGTGATGGGATGGAAACTGGGAAGCTCAGAGTATCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAAGCGGCTTTCTGGACGAAAACTGACGCTGAGGCGCGAAAGCCAGGGG"
## [4] "GCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGTGCGTAGGTGGTAACTTAAGCGCGGGGTTTAAGGCAATGGCTTAACCATTGTTCGCCCTGCGAACTGGGATACTTGAGTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGAAACTGACACTGAGGCACGAAAGTGTGGGG" 
## [5] "t__21316"                                                                                                                                                                                                                                 
## [6] "t__262551"

4.2 metagenomeSeq

# create the output directory for metagenomeSeq results (warns if it exists)
dir.create(paste0(output_data, 'metagenomseq/'))
###Run ZIG model fitting and prediction
# Fit metagenomeSeq's zero-inflated Gaussian (ZIG) model on a phyloseq object.
#   ps    : phyloseq object
#   maxit : maximum EM iterations passed to fitZig
#   mcut  : adjusted p-value cutoff
# Returns list(unfiltered MRtable, filtered table) where the filtered table
# keeps features with adjPvalues < mcut, no "NA" in the rowname, and at least
# the effective number of positive samples in both groups.
run_metagenom_seq<-function(ps,maxit, mcut){
  p_metag<-phyloseq_to_metagenomeSeq(ps)
  # CSS normalization at the 0.75 quantile
  p_metag= cumNorm(p_metag, p=0.75)
  normFactor =normFactors(p_metag)
  normFactor =log2(normFactor/median(normFactor) + 1)
  # model phenotype while adjusting for the (log-scaled) normalization factor
  mod = model.matrix(~phenotype + normFactor, data = pData(p_metag))
  settings =zigControl(maxit =maxit, verbose =FALSE)
  fit =fitZig(obj = p_metag, mod = mod, useCSSoffset = FALSE, control = settings)
  #Note: changed fit$taxa to fit@taxa in light of error (probably from newer metagenomeseq ver.)
  res_fit<-MRtable(fit, number = length(fit@taxa))
  res_fit_nonfiltered <- copy(res_fit)
  # keep adjusted-p significant features only
  res_fit<-res_fit[res_fit$adjPvalues<mcut,]
  # BUGFIX: drop features whose rowname contains "NA" BEFORE attaching the
  # effective-sample counts. The original subset Min_effec_samp with %in%
  # before removing the NA rows, so the vector could end up mismatched in
  # length and order with res_fit (the "there is a bug here" note).
  res_fit<-res_fit[grep("NA",rownames(res_fit), invert=TRUE),]
  # effective sample count per remaining feature, indexed by rowname so the
  # order matches res_fit exactly
  Min_effec_samp<-calculateEffectiveSamples(fit)[rownames(res_fit)]
  res_fit$Min_sample<-Min_effec_samp
  # finally require enough positive samples in both groups
  res_fit<-res_fit[res_fit$`+samples in group 0` >= Min_effec_samp & res_fit$`+samples in group 1` >= Min_effec_samp,]
  return(list(res_fit_nonfiltered, res_fit))
}

#Now run the ZIG model for each timepoint separately
P1<-prune_samples(rownames(sample_data(filtered_ps003))[sample_data(filtered_ps003)$Within.study.sampling.date == "Timepoint 1"], filtered_ps003)
P2<-prune_samples(rownames(sample_data(filtered_ps003))[sample_data(filtered_ps003)$Within.study.sampling.date == "Timepoint 2"], filtered_ps003)
P3<-prune_samples(rownames(sample_data(filtered_ps003))[sample_data(filtered_ps003)$Within.study.sampling.date == "Timepoint 3"], filtered_ps003)

zig_res_P1 <- run_metagenom_seq(P1,30, mtgseq_cut) # 30 = max iterations for the expectation-maximization algorithm
zig_res_P2 <- run_metagenom_seq(P2,30, mtgseq_cut)
zig_res_P3 <- run_metagenom_seq(P3,30, mtgseq_cut)

# print significant taxa (element [[2]] is the filtered table)
datatable(zig_res_P1[[2]])
datatable(zig_res_P2[[2]])
datatable(zig_res_P3[[2]])
# attach taxonomy and an enrichment direction label per timepoint;
# phenotypeN < 0 means higher abundance in the A (autism) group
zig_res_P1_filtered <- data.frame(cbind(zig_res_P1[[2]], tax_table(P1)[rownames(zig_res_P1[[2]]),]))
zig_res_P1_filtered$enriched <- ifelse(zig_res_P1_filtered$phenotypeN < 0, "Aut", "Control")

zig_res_P2_filtered <- data.frame(cbind(zig_res_P2[[2]], tax_table(P2)[rownames(zig_res_P2[[2]]), ]))
zig_res_P2_filtered$enriched <- ifelse(zig_res_P2_filtered$phenotypeN < 0, "Aut", "Control")

zig_res_P3_filtered <- data.frame(cbind(zig_res_P3[[2]], tax_table(P3)[rownames(zig_res_P3[[2]]), ]))
zig_res_P3_filtered$enriched <- ifelse(zig_res_P3_filtered$phenotypeN < 0, "Aut", "Control")


# save the full per-timepoint result lists
saveRDS(zig_res_P1, file=paste0(output_data, "metagenomseq/zig_res_P1_adjp", mtgseq_cut, ".rds"))
saveRDS(zig_res_P2, file=paste0(output_data, "metagenomseq/zig_res_P2_adjp", mtgseq_cut, ".rds"))
saveRDS(zig_res_P3, file=paste0(output_data, "metagenomseq/zig_res_P3_adjp", mtgseq_cut, ".rds"))

# do we have any ASV significant in more than one timepoint?
#Reduce(intersect, list(rownames(zig_res_P1_filtered),rownames(zig_res_P2_filtered),rownames(zig_res_P3_filtered)))
rownames(zig_res_P1[[2]])[which(rownames(zig_res_P1[[2]]) %in% c(rownames(zig_res_P2[[2]]), rownames(zig_res_P3[[2]])))]
## [1] "GCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTCTGTAAGACAGATGTGAAATCCCCGGGCTCAACCTGGGAATTGCATTTGTGACTGCAGGACTAGAGTTCATCAGAGGGGGGTGGAATTCCAAGTGTAGCAGTGAAATGCGTAGATATTTGGAAGAACACCAATGGCGAAGGCAGCCCCCTGGGATGCGACTGACGCTCATGCACGAAAGCGTGGGG"
## [2] "GCGAGCGTTATCCGGATTCATTGGGCGTAAAGCGCGCGTAGGCGGCCCGGCAGGCCGGGGGTCGAAGCGGGGGGCTCAACCCCCCGAAGCCCCCGGAACCTCCGCGGCTTGGGTCCGGTAGGGGAGGGTGGAACACCCGGTGTAGCGGTGGAATGCGCAGATATCGGGTGGAACACCGGTGGCGAAGGCGGCCCTCTGGGCCGAGACCGACGCTGAGGCGCGAAAGCTGGGGG"
# overlap between timepoint-2 and timepoint-3 significant features
rownames(zig_res_P2[[2]])[which(rownames(zig_res_P2[[2]]) %in% rownames(zig_res_P3[[2]]))]
## character(0)

4.3 Visualization of significant taxa

### functions to plot
# Plot normalized counts of selected ASVs, faceted per ASV and split by
# timepoint on the x-axis.
#   ps_norm   : normalized (DESeq, CSS or TSS) phyloseq object
#   grouping  : sample_data column used as fill/grouping (e.g. 'phenotype')
#   tax       : character vector of ASV ids (taxa_names) to plot
#   plot_type : 'box' for jittered boxplots, 'bar' for mean +/- SEM bars
# Returns a ggplot object.
# NOTE: the plot title uses deparse(substitute(ps_norm)), i.e. the expression
# the CALLER passed in, so it is only informative when called with a named object.
make_vis_plots <- function(ps_norm, grouping, tax, plot_type=c('box', 'bar')){
  # ps should be a normalized (DESeq or CSS) phyloseq object
  # grouping should match the column name in the sample_data
  # tax is a taxonomical bin id (ASV) in the counts table to plot
  
  # subset phyloseq object to select ASV of interest
  ps_filt=prune_taxa(taxa_names(ps_norm) %in% tax, ps_norm)
  
  # get normalized counts (rows = ASVs, melted to long format below)
  plot_table<-data.table(otu_table(ps_filt), keep.rownames=TRUE)[rn %in% tax]
  # add a tiny pseudocount (min nonzero value / 100000) so zeros survive the
  # log10 y-axis of the boxplot
  plot_table <- melt(plot_table, id.vars='rn')
  plot_table$value <- plot_table$value+min(plot_table[value!=0]$value)/100000
  
  # add metadata (grouping variable + timepoint), keyed on sample name
  groupDT=data.table(data.frame(sample_data(ps_filt)[, c(grouping, 'Within.study.sampling.date')]), keep.rownames=TRUE)
  setnames(groupDT, 'rn', 'variable')
  plot_table <- merge(plot_table, groupDT, by='variable', all.x=TRUE)
  
  # rename the grouping column to a generic name so aes() below works for any input
  setnames(plot_table, grouping, 'Group')

  # boxplot: per-sample points on a log scale
  if(plot_type=='box'){
    ggplot(data=plot_table, aes(x=Within.study.sampling.date, y = value, fill=Group)) + 
      geom_boxplot(outlier.color=NA) +
      geom_jitter(position=position_jitterdodge(0.2), cex=1.5, color="gray44") + 
      labs(title =deparse(substitute(ps_norm)), x='', y ='Proportional counts, log scale') + 
      scale_y_log10() + 
      scale_fill_manual(values=sgColorPalette)+
      theme_minimal() + facet_wrap(~rn, scales='free', ncol=3)+
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  } else if (plot_type=='bar'){
    # barplot: group means with standard-error bars, linear scale
    plot_table2 <- plot_table[, list(mean_ct=mean(value), sem=sd(value)/sqrt(.N)), by=c('Group', 'Within.study.sampling.date', 'rn')]
    ggplot(data=plot_table2, aes(x=Within.study.sampling.date, y =mean_ct, fill=Group)) + 
      geom_bar(stat='identity', position=position_dodge()) +
      geom_errorbar(aes(ymin=mean_ct-sem, ymax=mean_ct+sem), width=0.2, position=position_dodge(0.9))+ 
      labs(title =deparse(substitute(ps_norm)), x='', y ='Proportional counts, 0 to 1 scale') + 
      #scale_y_log10() + 
      scale_fill_manual(values=sgColorPalette)+
      theme_minimal() + facet_wrap(~rn, scales='free', ncol=3)+
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  }
}
######BOXPLOT of significant ones
# Gather the adjusted p-values of every significant taxon, per method and per
# timepoint, into one wide table so all p-values sit side by side.
sig_tables <- list(
  data.table(deseq_res_P1[[2]], keep.rownames=TRUE)[, list(rn, deseq_P1_adjp=padj)],
  data.table(deseq_res_P2[[2]], keep.rownames=TRUE)[, list(rn, deseq_P2_adjp=padj)],
  data.table(deseq_res_P3[[2]], keep.rownames=TRUE)[, list(rn, deseq_P3_adjp=padj)],
  data.table(bla[[2]], keep.rownames=TRUE)[, list(rn, deseq_timeseries_adjp=padj)],
  data.table(zig_res_P1[[2]], keep.rownames=TRUE)[, list(rn, mtgseq_P1_adjp=adjPvalues)],
  data.table(zig_res_P2[[2]], keep.rownames=TRUE)[, list(rn, mtgseq_P2_adjp=adjPvalues)],
  data.table(zig_res_P3[[2]], keep.rownames=TRUE)[, list(rn, mtgseq_P3_adjp=adjPvalues)]
)
# sequential full outer joins on the ASV id (same result as chained merges)
significant_tax <- Reduce(function(x, y) merge(x, y, by='rn', all=TRUE), sig_tables)

# drop the "nothing" placeholder rows produced when a test had no hits
significant_tax <- significant_tax[rn!='nothing']

# write the combined significance table
write.csv(significant_tax, file=paste0(output_data, 'Significant_res_deseq_q', deseq_cut, '_mtgseq_q', mtgseq_cut, '.csv'), row.names=FALSE)

datatable(significant_tax)
# also, find taxonomical annotations
# NOTE: single ASV may have multiple annotations due to tie hits
# (using tax_table(ps_not_norm_comp) in place of the unavailable all_tax_data object)
datatable(tax_table(ps_not_norm_comp)[rownames(tax_table(ps_not_norm_comp)) %in% significant_tax$rn])

4.3.1 DESeq results

## plot
# taxa significant by DESeq in all three timepoints
com_deseq_taxa=significant_tax[!is.na(deseq_P1_adjp) & !is.na(deseq_P2_adjp) & !is.na(deseq_P3_adjp)]

if(nrow(com_deseq_taxa)>0){
  print(make_vis_plots(ps_TSS_norm_pass_min_postDD_sup003, 'phenotype', com_deseq_taxa$rn, 'box'))
} else {
  print('no common DESeq significant taxa')
}
## [1] "no common DESeq significant taxa"

4.3.2 DESeq timeseries results

# deseq timeseries: plot every taxon flagged by the time-aware DESeq model
if(nrow(significant_tax[!is.na(deseq_timeseries_adjp)])>0){
  print(make_vis_plots(ps_TSS_norm_pass_min_postDD_sup003, 'phenotype', significant_tax[!is.na(deseq_timeseries_adjp)]$rn, 'box'))
  # plot bar as well
  print(make_vis_plots(ps_TSS_norm_pass_min_postDD_sup003, 'phenotype', significant_tax[!is.na(deseq_timeseries_adjp)]$rn, 'bar'))
} else {
  print('no DESeq timeseries significant taxa')
}

4.3.3 metagenomeSeq results

# taxa significant by metagenomeSeq in all three timepoints
com_mtgseq_taxa=significant_tax[!is.na(mtgseq_P1_adjp) & !is.na(mtgseq_P2_adjp) & !is.na(mtgseq_P3_adjp)]

if(nrow(com_mtgseq_taxa)>0){
  print(make_vis_plots(ps_TSS_norm_pass_min_postDD_sup003, 'phenotype', com_mtgseq_taxa$rn, 'box'))
} else {
  print('no common metagenomeSeq significant taxa')
}
## [1] "no common metagenomeSeq significant taxa"

5 CCA and visualization

Compare resulting amplicon data between and within sample types by canonical correlation analysis, regression profiling, and visualization (e.g. non-metric multi-dimensional scaling [NMDS], principal coordinates analysis, principal component analysis).

5.1 Constrained by status A or N

# Constrained PCoA (CAP on Bray-Curtis) ordinated by phenotype, restricted to
# families contributing all 6 samples, plus a beta-dispersion object.
#   ps    : normalized phyloseq object
#   title : normalization label used in the plot title
# Returns list(ggplot, betadisper result).
# NOTE(review): the dispersion is computed on the FULL object (not the 6-sample
# families) and grouped by Family.group.ID rather than phenotype — confirm intended.
plotting_phenotype_consPcoA <- function(ps, title){
  # families with exactly 6 samples (complete across timepoints)
  complete_fams <- names(which(table(sample_data(ps)$Family.group.ID) == 6))
  ps_6fam <- prune_samples(sample_data(ps)$Family.group.ID %in% complete_fams, ps)
  cap_ord <- ordinate(
    physeq = ps_6fam,
    method = "CAP",
    distance = "bray",
    formula = ~ phenotype
  )
  p <- plot_ordination(
    physeq = ps_6fam,
    ordination = cap_ord,
    color = "phenotype",
    axes = c(1, 2),
    title = paste("Constrained PcoA", title, "ordinated by phenotype with all timepoints")
  ) +
    geom_point(size = 2) +
    scale_color_manual(values = sgColorPalette) +
    theme_minimal() +
    theme(text = element_text(size = 10), plot.title = element_text(size = 10))

  # beta dispersion per family on the full object
  bray_full <- phyloseq::distance(ps, method = "bray")
  sampledf <- data.frame(sample_data(ps))
  beta_di <- betadisper(bray_full, sampledf$Family.group.ID)

  list(p, beta_di)
}

#With Deseq normalization: constrained ordination + family-level dispersion
DeSeq_distance<-plotting_phenotype_consPcoA(ps_DeSeq_norm_pass_min_postDD_sup003, "Deseq")
# plot (element [[1]] is the ggplot, [[2]] the betadisper object)
DeSeq_distance[[1]]

#same with CSS normalization
CSS_distance<-plotting_phenotype_consPcoA(ps_CSS_norm_pass_min_postDD_sup003, "CSS")
# plot
CSS_distance[[1]]

5.2 Constrained by family

#plotting
#Now we have: 803 taxa and 559 samples

#Looking at the families for the complete set of samples

#Keeping the same ordination but filtering to the families with all 6 points to help visualize the plot

#Looking at NORMALIZATION
# Constrained PCoA (CAP on Bray-Curtis) ordinated by family, restricted to
# families contributing all 6 samples, plus a beta-dispersion object.
#   ps    : normalized phyloseq object
#   title : normalization label used in the plot title
# Returns list(ggplot, betadisper result). Mirrors plotting_phenotype_consPcoA
# but constrains on Family.group.ID and hides the (very long) legend.
plotting_Fam_consPcoA <- function(ps, title){
  complete_fams <- names(which(table(sample_data(ps)$Family.group.ID) == 6))
  ps_6fam <- prune_samples(sample_data(ps)$Family.group.ID %in% complete_fams, ps)
  # prefix ids with 'fam' so they are handled as categorical labels
  sample_data(ps_6fam)$Family.group.ID <- paste0('fam', as.character(sample_data(ps_6fam)$Family.group.ID))
  cap_ord <- ordinate(
    physeq = ps_6fam,
    method = "CAP",
    distance = "bray",
    formula = ~ Family.group.ID
  )
  p <- plot_ordination(
    physeq = ps_6fam,
    ordination = cap_ord,
    color = "Family.group.ID",
    axes = c(1, 2),
    title = paste("Constrained PcoA", title, "ordinated by families with all timepoints")
  ) +
    geom_point(size = 2) +
    theme_minimal() +
    theme(text = element_text(size = 10), plot.title = element_text(size = 10), legend.position = 'none')

  # beta dispersion per family on the full (unfiltered) object
  bray_full <- phyloseq::distance(ps, method = "bray")
  sampledf <- data.frame(sample_data(ps))
  beta_di <- betadisper(bray_full, sampledf$Family.group.ID)

  list(p, beta_di)
}

#With Deseq normalization
DeSeq_distance<-plotting_Fam_consPcoA(ps_DeSeq_norm_pass_min_postDD_sup003, "Deseq")
# plot
DeSeq_distance[[1]]

#same with CSS normalization
CSS_distance<-plotting_Fam_consPcoA(ps_CSS_norm_pass_min_postDD_sup003, "CSS")
# plot
CSS_distance[[1]]

# compare the within-family dispersion (betadisper distance-to-median) between
# the two normalizations side by side
#pdf(file=paste0(output_data, "Figures/Distance_DeSeq_CSS_", Sys.Date(), ".pdf"))
boxplot(DeSeq_distance[[2]]$distances,CSS_distance[[2]]$distances, names=c("DeSeq", "CSS"), 
        xlab = "Type of Normalization", ylab = "Distance on Component 1 & 2", main ="Intragroup distance for each family")

#dev.off()

6 Diversity

Characterize and assess the diversity of each sample, and evaluate the extent of dissimilarity between the cohorts

6.1 Alpha diversity

# alpha diversity on the non-normalized phyloseq object
ER <- estimate_richness(ps_not_norm_comp, measures=c("Observed", "Chao1", "Shannon"))
# attach the metadata columns needed for grouping and faceting
ER <- cbind(ER, sample_data(ps_not_norm_comp)[row.names(ER), c("phenotype", "Family.group.ID", "Within.study.sampling.date")])
ER <- data.table(ER, keep.rownames = TRUE)
# long format: one row per sample x diversity index
ER <- melt(ER, id.vars=c('rn', 'phenotype', "Family.group.ID", "Within.study.sampling.date"))

# plot each index per phenotype (se.chao1 is a standard error, not an index)
ggplot(data=ER[variable!='se.chao1'], aes(x=phenotype, y=value, fill=phenotype))+
  geom_boxplot(width=0.7, outlier.colour='white')+
  geom_jitter(size=1, position=position_jitter(width=0.1))+
  xlab('')+ylab('')+
  scale_fill_manual(values=sgColorPalette)+
  theme_minimal()+facet_wrap(~variable, scales='free')

# equal-variance t-test of each alpha-diversity index between phenotypes
# NOTE(review): samples within a family/timepoint are repeated measures but the
# t-test treats them as independent — confirm this is acceptable here
ttest=NULL
for(alphad in c('Observed', 'Chao1', 'Shannon')){
  ttest_res=t.test(value ~ phenotype, data=ER[variable==alphad], var.equal=TRUE)
  ttest=rbindlist(list(ttest, data.table(alpha_index=alphad, pvalue=ttest_res$p.value)))
}

pander(ttest)
alpha_index pvalue
Observed 0.2834
Chao1 0.2834
Shannon 0.7705

6.2 Beta diversity

#Let's do a PcoA: not much difference between phenotypes is visible
# unconstrained PCoA (Bray-Curtis) on DESeq-normalized counts;
# GP.ord is reused later by wo_na_pcoa() in section 7.2
GP.ord <- ordinate(ps_DeSeq_norm_pass_min_postDD_sup003, "PCoA", "bray")
p2 = plot_ordination(ps_DeSeq_norm_pass_min_postDD_sup003, GP.ord, type="samples", color="phenotype") +
  geom_point( size = 1)+
  scale_color_manual(values=sgColorPalette)+
  theme_minimal()
p2

7 PERMANOVA

Non-parametric statistical approaches (ANOSIM, ADONIS, ANOVA, PERMANOVA, etc.) will be employed to determine the significance of noteworthy factors, such as digital phenotype and probiotic and/or antibiotic use.

7.1 PERMANOVA test

# PERMANOVA (vegan::adonis on a Bray-Curtis distance) for one metadata variable.
#   physeq     : phyloseq object
#   factorName : sample_data column to test
#   ifnumeric  : coerce the column to numeric before testing?
#   pmt        : number of permutations (default 999)
# Returns a one-row data.table: Variable, FactorClass, TotalN, R2, pvalue.
permanova <- function(physeq, factorName, ifnumeric, pmt=999){
  set.seed(1)
  # the formula refers to `braydist` by name, so this variable name must stay
  braydist <- phyloseq::distance(physeq, "bray")
  form <- as.formula(paste0("braydist ~ ", factorName))
  metaDF <- data.frame(sample_data(physeq)[, as.character(factorName)])
  # coerce to numeric when requested; record the class label for the output
  if (ifnumeric) {
    metaDF[, factorName] <- as.numeric(metaDF[, factorName])
  }
  factor_class <- if (ifnumeric) 'numeric' else 'categorical'
  perm <- adonis(form, permutations = pmt, metaDF)
  data.table(
    Variable = factorName,
    FactorClass = factor_class,
    TotalN = perm$aov.tab['Total', 'Df'] + 1,
    R2 = perm$aov.tab[factorName, 'R2'],
    pvalue = perm$aov.tab[factorName, 'Pr(>F)'][1]
  )
}
#betadispersion screen: variables with significant within-group dispersion
#differences can confound PERMANOVA, so they are flagged for removal later
# keep only the category columns selected above as relevant
tmp_metadat<-metadata_ok[,c(num_cat,fac_cat)]
# drop variables with too few samples to test
tmp_metadat<-tmp_metadat[,-which(colnames(tmp_metadat) %in% c("Number.of.pet.reptiles","Number.of.pet.horses", "Pet.horse"))]
# drop variables filled with only NA or a single factor level; permutest/adonis
# require at least 2 levels
col_levels<-sapply(tmp_metadat, levels)
col_levelscount<-sapply(col_levels, length)
tmp_metadat_1 <- tmp_metadat
# no numeric columns remain (see commented check), so keep factors with >= 2 levels
#tmp_metadat[,which(sapply(tmp_metadat, class) == "numeric")]
tmp_metadat <- tmp_metadat[,which(col_levelscount >= 2)]

set.seed(1)
pval_factors_diper=c()
nb_samples_disper=c()
# for each variable: subset to usable samples, compute Bray-Curtis dispersion
# per group, and record the permutation-test p-value + sample count
for (i in 1:length(tmp_metadat)){
  #cat (i,"\t")
  # rows where this variable is non-missing and non-empty
  test_map<-tmp_metadat[!is.na(tmp_metadat[,i]) & tmp_metadat[,i] != "" ,]
  ps.tmp<-copy(ps_DeSeq_norm_pass_min_postDD_sup003)
  sample_data(ps.tmp) <- test_map
  df_metadata <- data.frame(sample_data(ps.tmp))
  df_metadata<-df_metadata[df_metadata[,colnames(test_map)[i]] != "",]
  # use prune_samples instead of subset_samples
  keepid=!is.na(get_variable(ps.tmp, colnames(test_map)[i])) &
    get_variable(ps.tmp, colnames(test_map)[i])!='' &
    get_variable(ps.tmp, colnames(test_map)[i])!='NA' 
  ps.tmp <- prune_samples(keepid, ps.tmp)
  #ps.tmp <- subset_samples(ps.tmp, colnames(test_map)[i] !="")
  # NOTE(review): df_metadata is filtered only on "" while keepid also removes
  # NA/'NA'; betadisper assumes its grouping vector aligns row-for-row with the
  # distance matrix — confirm both filters select the same samples
  tmp_nb_samples<-dim(otu_table(ps.tmp))[2]
  OTU_tables_bray <- phyloseq::distance(ps.tmp, method = "bray")
  beta <- betadisper(OTU_tables_bray, df_metadata[,colnames(test_map)[i]])
  tmp<-permutest(beta)
  tmp<-tmp$tab$`Pr(>F)`[1]
  pval_factors_diper<-c(pval_factors_diper,tmp)
  nb_samples_disper<-c(nb_samples_disper,tmp_nb_samples)}
# FDR-correct the p-values; variables still significant are flagged to be
# excluded from the PERMANOVA interpretation later
names(pval_factors_diper)<-colnames(tmp_metadat)
pval_factors_diper<-p.adjust(pval_factors_diper, method = "fdr")
to_remove_betadis<-names(pval_factors_diper)[pval_factors_diper<0.05]
  
# list of permanova variables
# meta_cat is assumed to exist from earlier code; its (commented) construction:
#meta_cat <- tibble(col_levelscount >= 2, colnames(tmp_metadat_1), sapply(tmp_metadat_1, class))
#rownames(meta_cat) <- colnames(tmp_metadat_1)
#colnames(meta_cat) <- c("permanova", "varname", "type")
#meta_cat$type <- gsub("factor", "Categorical", meta_cat$type)
#meta_cat$type <- gsub("numerical", "Continuous", meta_cat$type)

# the meta_cat file listed phenotype as FALSE for permanova; re-enable it here
meta_cat$permanova[which(meta_cat$varname == "phenotype")] <- "Categorical"
# keep every variable not explicitly flagged FALSE
permanova_var=meta_cat[which(meta_cat$permanova!=FALSE),]

set.seed(1)
permanova_res=NULL
# Run PERMANOVA for every candidate variable: drop samples where the variable
# is missing, skip variables that end up with a single category, and collect
# one result row per tested variable.
for(j in seq_len(nrow(permanova_var))){
  # variable name (as.character added in case varname is a factor)
  var_name <- as.character(permanova_var$varname[j])
  # remove samples where this variable is NA, the literal string 'NA', or empty
  keepid <- !is.na(get_variable(ps_DeSeq_norm_pass_min_postDD_sup003, var_name)) &
    get_variable(ps_DeSeq_norm_pass_min_postDD_sup003, var_name)!='NA' &
    get_variable(ps_DeSeq_norm_pass_min_postDD_sup003, var_name)!=''
  tmp_ps <- prune_samples(keepid, ps_DeSeq_norm_pass_min_postDD_sup003)

  var_type <- permanova_var$permanova[j]
  # only test variables with more than one remaining category and a known type;
  # the original Categorical/Continuous branches differed only in `ifnumeric`,
  # so they are merged into a single call here
  if(uniqueN(sample_data(tmp_ps)[, var_name])>1 &&
     var_type %in% c('Categorical', 'Continuous')){
    p <- permanova(tmp_ps, factorName=var_name,
                   ifnumeric=(var_type=='Continuous'), pmt=999)
    permanova_res <- rbindlist(list(permanova_res, p))
    rm(p)
  }
  rm(var_name)
}

# write the full PERMANOVA table before any filtering
write.csv(permanova_res, file=paste0(output_data, 'PERMANOVA.csv'), row.names=FALSE)

# total number of variables tested
uniqueN(permanova_res$Variable)

[1] 124

# Factor class breakdown
pander(table(permanova_res$FactorClass))
categorical numeric
102 22
# number of variables significant at permanova_pcut
uniqueN(permanova_res[pvalue<permanova_pcut]$Variable)

[1] 112

# remove variables whose significant beta-dispersion could confound PERMANOVA
impacting_compo<-setdiff(permanova_res[pvalue<permanova_pcut]$Variable,  to_remove_betadis)

# keep only variables also significant between the two cohorts (chi-square) or age
impacting_compo<-impacting_compo[impacting_compo %in% c(names(all_chisquare),"Age..months.")]
permanova_res<- permanova_res[permanova_res$Variable %in% impacting_compo,]
# sort by effect size
permanova_res <- permanova_res[order(R2, decreasing=TRUE)]
datatable(permanova_res)
# (filename typo "PERMANOV...corhort" kept as-is to match existing outputs)
write.csv(permanova_res, file=paste0(output_data, 'PERMANOV_betadis_imp_corhort.csv'), row.names=FALSE)

7.2 Visualize permanova significant variables on PCoA

# visualize only the variables with R2 above 0.01
imp_factors<-permanova_res$Variable[permanova_res$R2 > 0.01]

# constrained ordination on the retained variables
# (ordinate's formula interface could not handle them one at a time)
ps_pcoa <- ordinate(
  physeq = ps_DeSeq_norm_pass_min_postDD_sup003, 
  method = "CAP", 
  distance = "bray",
  #Did not include Toilet.cover and Meat/Seafood Longitudinal, Fruit..consumption.frequency...longitudinal. and LR10.prediction..M3. due to NA missing values which does not allow for ordination
  formula = ~ Ready.to.eat.meals..consumption.frequency. + Dairy..consumption.frequency. +  Fruit..consumption.frequency. +  Age..months.)
    
# the variables actually used in the formula (excluded ones removed)
title_prep<-imp_factors[-c(which(imp_factors %in% c("Toilet.cover..M3.", "Meats.and.seafood..consumption.frequency...longitudinal.", "LR10.prediction..M3.", "Fruit..consumption.frequency...longitudinal.")))]

# one ordination panel per retained variable, colored by that variable
to_plot=list()
for (i in 1:length(title_prep)){
  to_plot[[i]] <- plot_ordination(
  physeq = ps_DeSeq_norm_pass_min_postDD_sup003, 
  ordination = ps_pcoa, 
  color = title_prep[i], 
  axes = c(1,2),
  title=title_prep[i]
) + 
  geom_point( size = 0.5) +
  theme(text = element_text(size =20), plot.title = element_text(size=15))
}
# NOTE(review): slot 5 assumes title_prep has exactly 4 entries — confirm,
# otherwise the last variable panel is overwritten by the taxa plot
to_plot[[5]]<-plot_ordination(physeq = ps_DeSeq_norm_pass_min_postDD_sup003, ordination = ps_pcoa, type="taxa",title ="Taxa") + theme(text = element_text(size =15))
lay <- rbind(c(1),
             c(2),
             c(3),
             c(4),
             c(5))


# stack the 5 panels vertically
#pdf(paste0(output_data,"confounding_factors.pdf"), width=16, height=40)
grid.arrange(grobs = to_plot, layout_matrix = lay)

#dev.off()
# and the taxa-only view at a readable text size
plot_ordination(physeq = ps_DeSeq_norm_pass_min_postDD_sup003, ordination = ps_pcoa, type="taxa",title ="Taxa") + theme(text = element_text(size =8))

#ok let's try to find the species that show some importance in this PCA
taxa.to.select<-vegan::scores(ps_pcoa)$species
#now plot it with no names, for visibility
rownames(taxa.to.select)<-c()
s.arrow(taxa.to.select) #the taxa that influence the most the plots are above 0.25

# taxa with a strong loading (>0.1) on either of the first two CAP axes
taxa.to.select.to.rem<-vegan::scores(ps_pcoa)$species[abs(vegan::scores(ps_pcoa)$species[,1])>0.1 | abs(vegan::scores(ps_pcoa)$species[,2])>0.1,]
#any overlap with the significant time-series ASVs?
# BUGFIX: taxa.to.select.to.rem is a numeric score matrix, so `%in%` against it
# compared taxon NAMES with score VALUES (always FALSE); compare against its
# rownames instead.
rownames(bla[[2]]) %in% rownames(taxa.to.select.to.rem)
## originally reported (with the buggy comparison): FALSE x6 — "NOPE!"
# function to plot PCoA without NA points
# Plot the first two PCoA axes of a pre-computed ordination, colored by one
# metadata variable, after dropping samples where that variable is missing.
#   ps      : phyloseq object
#   pvar    : sample_data column to color by
#   ord_res : ordination object whose $vectors rows are sample names
# Prints (and returns invisibly via print) the ggplot.
wo_na_pcoa <- function(ps, pvar, ord_res){
  # drop samples where pvar is NA, the literal string 'NA', or empty
  vals <- get_variable(ps, pvar)
  keepid <- !is.na(vals) & vals != 'NA' & vals != ''
  tmp_ps <- prune_samples(keepid, ps)

  # combine the two PCoA coordinates with the metadata column of interest
  plot_df <- cbind(ord_res$vectors[row.names(sample_data(tmp_ps)), 1:2], sample_data(tmp_ps)[, pvar])
  # generic column name so aes() works for any input variable
  setnames(plot_df, pvar, 'testvar')

  # percent of total variance explained by each of the first two axes
  pct_var <- (ord_res$values$Eigenvalues / sum(ord_res$values$Eigenvalues))[1:2] * 100

  p <- ggplot(data = plot_df, aes(x = Axis.1, y = Axis.2, colour = testvar)) +
    geom_point(size = 2) +
    ggtitle(pvar) +
    xlab(paste0('Axis.1 [', format(pct_var[1], digits = 3), '%]')) +
    ylab(paste0('Axis.2 [', format(pct_var[2], digits = 3), '%]')) +
    theme_minimal() +
    theme(legend.title = element_blank(), legend.position = "bottom")

  print(p)
}

7.2.1 Digital phenotype

# the risk score is stored as text; coerce to numeric so the plot uses a gradient
sample_data(ps_DeSeq_norm_pass_min_postDD_sup003)$Mobile.Autism.Risk.Assessment.Score <- as.numeric(sample_data(ps_DeSeq_norm_pass_min_postDD_sup003)$Mobile.Autism.Risk.Assessment.Score)

wo_na_pcoa(ps_DeSeq_norm_pass_min_postDD_sup003, 'Mobile.Autism.Risk.Assessment.Score', GP.ord)

7.2.2 Probiotics

# PCoA colored by probiotic consumption frequency (missing samples dropped)
wo_na_pcoa(ps_DeSeq_norm_pass_min_postDD_sup003, 'Probiotic..consumption.frequency.', GP.ord)

7.2.3 Antibiotics

# Anti.infective use (categorical)
wo_na_pcoa(ps_DeSeq_norm_pass_min_postDD_sup003, 'Anti.infective', GP.ord)

# Minimum.time.since.antibiotics: stored as text, coerce to numeric for a gradient
sample_data(ps_DeSeq_norm_pass_min_postDD_sup003)$Minimum.time.since.antibiotics <- as.numeric(sample_data(ps_DeSeq_norm_pass_min_postDD_sup003)$Minimum.time.since.antibiotics)
wo_na_pcoa(ps_DeSeq_norm_pass_min_postDD_sup003, 'Minimum.time.since.antibiotics', GP.ord)

7.2.4 All other passed R2 cutoff and significant

# PCoA for every remaining variable that passes both the R2 and p-value cutoffs
for (pvar in permanova_res[R2 > permanova_cut & pvalue < permanova_pcut]$Variable) {
  wo_na_pcoa(ps_DeSeq_norm_pass_min_postDD_sup003, pvar, GP.ord)
}

8 Session information

# record R and package versions for reproducibility
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 8 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.3.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] phyloseq_1.30.0             biomformat_1.14.0          
##  [3] DESeq2_1.26.0               SummarizedExperiment_1.16.1
##  [5] DelayedArray_0.12.3         BiocParallel_1.20.1        
##  [7] matrixStats_0.56.0          GenomicRanges_1.38.0       
##  [9] GenomeInfoDb_1.22.1         IRanges_2.20.2             
## [11] S4Vectors_0.24.4            metagenomeSeq_1.28.2       
## [13] RColorBrewer_1.1-2          glmnet_4.0-2               
## [15] Matrix_1.2-18               limma_3.42.2               
## [17] Biobase_2.46.0              BiocGenerics_0.32.0        
## [19] vegan_2.5-6                 lattice_0.20-40            
## [21] permute_0.9-5               adegraphics_1.0-15         
## [23] gridExtra_2.3               DT_0.14                    
## [25] pander_0.6.3                ggplot2_3.3.0              
## [27] dplyr_0.8.5                 reshape2_1.4.3             
## [29] tidyr_1.0.2                 knitr_1.28                 
## [31] devtools_2.2.2              usethis_1.5.1              
## [33] data.table_1.12.8           tibble_3.0.2               
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.5        Hmisc_4.4-0            igraph_1.2.5          
##   [4] plyr_1.8.6             sp_1.4-1               splines_3.6.3         
##   [7] crosstalk_1.1.0.1      lpsymphony_1.16.0      digest_0.6.25         
##  [10] foreach_1.4.8          htmltools_0.4.0        gdata_2.18.0          
##  [13] fansi_0.4.1            magrittr_1.5           checkmate_2.0.0       
##  [16] memoise_1.1.0          cluster_2.1.0          remotes_2.1.1         
##  [19] Biostrings_2.54.0      annotate_1.64.0        prettyunits_1.1.1     
##  [22] jpeg_0.1-8.1           colorspace_1.4-1       blob_1.2.1            
##  [25] xfun_0.12              callr_3.4.2            crayon_1.3.4          
##  [28] RCurl_1.98-1.2         jsonlite_1.6.1         genefilter_1.68.0     
##  [31] ape_5.4                survival_3.2-3         iterators_1.0.12      
##  [34] glue_1.3.2             gtable_0.3.0           zlibbioc_1.32.0       
##  [37] XVector_0.26.0         pkgbuild_1.0.6         Rhdf5lib_1.8.0        
##  [40] shape_1.4.4            scales_1.1.0           DBI_1.1.0             
##  [43] IHW_1.14.0             Rcpp_1.0.4             xtable_1.8-4          
##  [46] htmlTable_1.13.3       foreign_0.8-76         bit_1.1-15.2          
##  [49] Formula_1.2-3          htmlwidgets_1.5.1      gplots_3.0.4          
##  [52] acepack_1.4.1          ellipsis_0.3.0         farver_2.0.3          
##  [55] pkgconfig_2.0.3        XML_3.99-0.3           nnet_7.3-13           
##  [58] locfit_1.5-9.4         labeling_0.3           tidyselect_1.0.0      
##  [61] rlang_0.4.6            AnnotationDbi_1.48.0   munsell_0.5.0         
##  [64] tools_3.6.3            cli_2.0.2              RSQLite_2.2.0         
##  [67] ade4_1.7-15            fdrtool_1.2.15         evaluate_0.14         
##  [70] stringr_1.4.0          yaml_2.2.1             processx_3.4.2        
##  [73] bit64_0.9-7            fs_1.3.2               caTools_1.18.0        
##  [76] purrr_0.3.3            nlme_3.1-145           slam_0.1-47           
##  [79] compiler_3.6.3         rstudioapi_0.11        png_0.1-7             
##  [82] testthat_2.3.2         geneplotter_1.64.0     stringi_1.4.6         
##  [85] ps_1.3.2               desc_1.2.0             multtest_2.42.0       
##  [88] vctrs_0.3.1            pillar_1.4.3           lifecycle_0.2.0       
##  [91] bitops_1.0-6           R6_2.4.1               latticeExtra_0.6-29   
##  [94] KernSmooth_2.23-16     sessioninfo_1.1.1      codetools_0.2-16      
##  [97] MASS_7.3-51.5          gtools_3.8.1           assertthat_0.2.1      
## [100] pkgload_1.0.2          Wrench_1.4.0           rhdf5_2.30.1          
## [103] rprojroot_1.3-2        withr_2.1.2            GenomeInfoDbData_1.2.2
## [106] mgcv_1.8-31            grid_3.6.3             rpart_4.1-15          
## [109] rmarkdown_2.1          base64enc_0.1-3